#1. How many breweries are present in each state?
#Solution:
#We can observe from the ordered bar plot the number of breweries that are present in each state.

#libraries
library(tm) #text mining library provides the stopwords() function
## Loading required package: NLP
library(tidyr)
library(plyr)
library(jsonlite)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   1.0.0
## ✔ tibble  3.1.8     ✔ stringr 1.5.0
## ✔ readr   2.1.3     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::annotate() masks NLP::annotate()
## ✖ dplyr::arrange()    masks plyr::arrange()
## ✖ purrr::compact()    masks plyr::compact()
## ✖ dplyr::count()      masks plyr::count()
## ✖ dplyr::failwith()   masks plyr::failwith()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ purrr::flatten()    masks jsonlite::flatten()
## ✖ dplyr::id()         masks plyr::id()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ dplyr::mutate()     masks plyr::mutate()
## ✖ dplyr::rename()     masks plyr::rename()
## ✖ dplyr::summarise()  masks plyr::summarise()
## ✖ dplyr::summarize()  masks plyr::summarize()
library(ggplot2)
library(stringr)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggthemes)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(class)
library(e1071)
library(usmap)
#The output of the "codebook" function will be a table that provides information about each variable in the dataset, including its name, label (if any), data type, number of missing values, and summary statistics such as mean, standard deviation, and quartile values.
#Codebook 
#library(memisc)
#codebook(Breweries)
#codebook(Beers)
#codebook(merged_data)

#import  data, merge and remove na values
Breweries = read.csv("/Users/alrousan95/Downloads/Breweries.csv", header = TRUE)
Beers = read.csv("/Users/alrousan95/Downloads/Beers.csv", header = TRUE)
merged_data <- merge(Breweries, Beers, by.x = "Brew_ID", by.y = "Brewery_id", all = TRUE)
merged_data <- na.omit(merged_data)
#1411x11
dim(merged_data)
## [1] 1405   10
dim(Breweries)
## [1] 558   4
head(Breweries)
##   Brew_ID                      Name          City State
## 1       1        NorthGate Brewing    Minneapolis    MN
## 2       2 Against the Grain Brewery    Louisville    KY
## 3       3  Jack's Abby Craft Lagers    Framingham    MA
## 4       4 Mike Hess Brewing Company     San Diego    CA
## 5       5   Fort Point Beer Company San Francisco    CA
## 6       6     COAST Brewing Company    Charleston    SC
#how many states are in the usa, by googling we see that there are 50 states

#US Heat map of the number of Breweries for each state

#alternative way to create US heat map, but does not knit for some reason
#library(usmap)
#US heat map of breweries 
#nationBrewPlot <- plot_usmap(data = statepop, values = 'breweriesState',labels=TRUE, color = "grey73") + scale_fill_continuous(low = "white", high = "red", name = "Brewery Count", label = scales::comma) + theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
#nationBrewPlot

library(dplyr)
library(plotly)
# group by state and count number of breweries for each state
Breweries_by_State <- Breweries %>%
  group_by(State) %>%
  dplyr::summarize(num_Breweries = n()) 

Breweries_by_State$region <- c("alabama", "alaska", "arizona", "arkansas", "california", 
                "colorado", "connecticut","district of columbia", "delaware", "florida", "georgia", 
                "hawaii", "idaho", "illinois", "indiana", "iowa", "kansas", 
                "kentucky", "louisiana", "maine", "maryland", "massachusetts", 
                "michigan", "minnesota", "mississippi", "missouri", "montana", 
                "nebraska", "nevada", "new hampshire", "new jersey", "new mexico", 
                "new york", "north carolina", "north dakota", "ohio", "oklahoma", 
                "oregon", "pennsylvania", "rhode island", "south carolina", "south dakota", 
                "tennessee", "texas", "utah", "vermont", "virginia", "washington", 
                "west virginia", "wisconsin", "wyoming")


Breweries_by_State
## # A tibble: 51 × 3
##    State num_Breweries region              
##    <chr>         <int> <chr>               
##  1 " AK"             7 alabama             
##  2 " AL"             3 alaska              
##  3 " AR"             2 arizona             
##  4 " AZ"            11 arkansas            
##  5 " CA"            39 california          
##  6 " CO"            47 colorado            
##  7 " CT"             8 connecticut         
##  8 " DC"             1 district of columbia
##  9 " DE"             2 delaware            
## 10 " FL"            15 florida             
## # … with 41 more rows
dim(Breweries_by_State)
## [1] 51  3
#drop that first column
Breweries_by_State = Breweries_by_State[-1]
head(Breweries_by_State)
## # A tibble: 6 × 2
##   num_Breweries region    
##           <int> <chr>     
## 1             7 alabama   
## 2             3 alaska    
## 3             2 arizona   
## 4            11 arkansas  
## 5            39 california
## 6            47 colorado
states <- map_data("state")
head(states)
##        long      lat group order  region subregion
## 1 -87.46201 30.38968     1     1 alabama      <NA>
## 2 -87.48493 30.37249     1     2 alabama      <NA>
## 3 -87.52503 30.37249     1     3 alabama      <NA>
## 4 -87.53076 30.33239     1     4 alabama      <NA>
## 5 -87.57087 30.32665     1     5 alabama      <NA>
## 6 -87.58806 30.32665     1     6 alabama      <NA>
#merge states and Breweries_by_State tables
#If all.x = TRUE, then all rows from the "x" data frame will be included in the output, even if there is no matching row in the "y" data frame. If all.x = FALSE (the default), then only the rows from the "x" data frame that have a match in the "y" data frame will be included in the output.
map.df <- merge(states,Breweries_by_State, by="region", all.x=T)
head(map.df)
##    region      long      lat group order subregion num_Breweries
## 1 alabama -87.46201 30.38968     1     1      <NA>             7
## 2 alabama -87.48493 30.37249     1     2      <NA>             7
## 3 alabama -87.52503 30.37249     1     3      <NA>             7
## 4 alabama -87.53076 30.33239     1     4      <NA>             7
## 5 alabama -87.57087 30.32665     1     5      <NA>             7
## 6 alabama -87.58806 30.32665     1     6      <NA>             7
#map.df
#order 
map.df <- map.df[order(map.df$order),]
head(map.df)
##    region      long      lat group order subregion num_Breweries
## 1 alabama -87.46201 30.38968     1     1      <NA>             7
## 2 alabama -87.48493 30.37249     1     2      <NA>             7
## 3 alabama -87.52503 30.37249     1     3      <NA>             7
## 4 alabama -87.53076 30.33239     1     4      <NA>             7
## 5 alabama -87.57087 30.32665     1     5      <NA>             7
## 6 alabama -87.58806 30.32665     1     6      <NA>             7
#plot Map
ggplot(map.df, aes(x=long,y=lat,group=group))+
  geom_polygon(aes(fill=num_Breweries))+
  geom_path()+ 
   scale_fill_gradient(low = "white", high = "red", na.value = "grey90")+
  ggtitle("Number of Breweries by State")+
coord_map() +
theme(legend.position = "bottom", legend.justification = "left") +
labs(fill = "Brewery Count")

#Bar Plot of Number of Breweries

#bar plot of the number of breweries by state in descending order from left to right
Breweries_by_State <- Breweries %>%
group_by(State) %>%
dplyr::summarize(num_Breweries = n()) 
gg <- Breweries_by_State %>% 
ggplot(aes(x = reorder(State, -num_Breweries), y = num_Breweries, fill = State)) +
geom_bar(stat = "identity") +
geom_text(aes(label = num_Breweries), vjust = -0.5, size = 3) +
xlab("State") +
ylab("Number of breweries") +
ggtitle("Total Brewery Count Per State") +
theme(axis.text.x = element_text(angle = 45, hjust = 0.5, size = 8), axis.text.y=element_text(size=13), text=element_text(size=20), legend.position = "none")
ggplotly(gg)
#max number of breweries
max_breweries <- max(Breweries_by_State$num_Breweries)
max_breweries
## [1] 47
#min number of breweries
min_breweries <- min(Breweries_by_State$num_Breweries)
min_breweries
## [1] 1
# Filter for states with only one brewery
one_brewery_states <- Breweries_by_State %>% 
filter(num_Breweries == 1)
# View the filtered data
print(one_brewery_states)
## # A tibble: 4 × 2
##   State num_Breweries
##   <chr>         <int>
## 1 " DC"             1
## 2 " ND"             1
## 3 " SD"             1
## 4 " WV"             1
# Filter for states with only one brewery
fortyseven_brewery_states <- Breweries_by_State %>% 
filter(num_Breweries == 47)

# View the filtered data
print(fortyseven_brewery_states)
## # A tibble: 1 × 2
##   State num_Breweries
##   <chr>         <int>
## 1 " CO"            47
#get mean number of Breweries for all states
mean(Breweries_by_State$num_Breweries)
## [1] 10.94118
#the mean number of breweries for states in the US are around 10.94 or 11

#2. Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the merged file. (RMD only, this does not need to be included in the presentation or the deck.)
#Solution
#We merged the data and checked that the first 6 and last 6 observations matched up with the head and tail function. This match show that the Breweries and Beer data were merged correctly.

#get a feel for data by returning first 6 rows
head(Beers)
##                  Name Beer_ID   ABV IBU Brewery_id
## 1            Pub Beer    1436 0.050  NA        409
## 2         Devil's Cup    2265 0.066  NA        178
## 3 Rise of the Phoenix    2264 0.071  NA        178
## 4            Sinister    2263 0.090  NA        178
## 5       Sex and Candy    2262 0.075  NA        178
## 6        Black Exodus    2261 0.077  NA        178
##                            Style Ounces
## 1            American Pale Lager     12
## 2        American Pale Ale (APA)     12
## 3                   American IPA     12
## 4 American Double / Imperial IPA     12
## 5                   American IPA     12
## 6                  Oatmeal Stout     12
#get dimension of the data via number of rows and number columns
dim(Breweries)
## [1] 558   4
dim(Beers)
## [1] 2410    7
#get names of columns
names(Breweries)
## [1] "Brew_ID" "Name"    "City"    "State"
names(Beers)
## [1] "Name"       "Beer_ID"    "ABV"        "IBU"        "Brewery_id"
## [6] "Style"      "Ounces"
#get data type of columns
str(Breweries)
## 'data.frame':    558 obs. of  4 variables:
##  $ Brew_ID: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Name   : chr  "NorthGate Brewing " "Against the Grain Brewery" "Jack's Abby Craft Lagers" "Mike Hess Brewing Company" ...
##  $ City   : chr  "Minneapolis" "Louisville" "Framingham" "San Diego" ...
##  $ State  : chr  " MN" " KY" " MA" " CA" ...
str(Beers)
## 'data.frame':    2410 obs. of  7 variables:
##  $ Name      : chr  "Pub Beer" "Devil's Cup" "Rise of the Phoenix" "Sinister" ...
##  $ Beer_ID   : int  1436 2265 2264 2263 2262 2261 2260 2259 2258 2131 ...
##  $ ABV       : num  0.05 0.066 0.071 0.09 0.075 0.077 0.045 0.065 0.055 0.086 ...
##  $ IBU       : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Brewery_id: int  409 178 178 178 178 178 178 178 178 178 ...
##  $ Style     : chr  "American Pale Lager" "American Pale Ale (APA)" "American IPA" "American Double / Imperial IPA" ...
##  $ Ounces    : num  12 12 12 12 12 12 12 12 12 12 ...
#merge the data by primary key Brew_ID, since Brewery_id is the same thing but a foreign key in Beers data frame
merged_data <- merge(Breweries, Beers, by.x = "Brew_ID", by.y = "Brewery_id", all = TRUE)
#get a feel for data by returning first 6 rows
head(merged_data)
##   Brew_ID             Name.x        City State        Name.y Beer_ID   ABV IBU
## 1       1 NorthGate Brewing  Minneapolis    MN       Pumpion    2689 0.060  38
## 2       1 NorthGate Brewing  Minneapolis    MN    Stronghold    2688 0.060  25
## 3       1 NorthGate Brewing  Minneapolis    MN   Parapet ESB    2687 0.056  47
## 4       1 NorthGate Brewing  Minneapolis    MN  Get Together    2692 0.045  50
## 5       1 NorthGate Brewing  Minneapolis    MN Maggie's Leap    2691 0.049  26
## 6       1 NorthGate Brewing  Minneapolis    MN    Wall's End    2690 0.048  19
##                                 Style Ounces
## 1                         Pumpkin Ale     16
## 2                     American Porter     16
## 3 Extra Special / Strong Bitter (ESB)     16
## 4                        American IPA     16
## 5                  Milk / Sweet Stout     16
## 6                   English Brown Ale     16
#get dimension of the data via number of rows and number columns
#compare dimensions of the data frame so you can see how the data was merged
#Notice without removing the NA values we have 2410 rows (observations) we can see that there will be 10 columns instead of 11 since the merged data shares Breweries data set primary key Brew_ID 
dim(merged_data)
## [1] 2410   10
#get names of columns
names(merged_data)
##  [1] "Brew_ID" "Name.x"  "City"    "State"   "Name.y"  "Beer_ID" "ABV"    
##  [8] "IBU"     "Style"   "Ounces"
#get data type of columns
str(merged_data)
## 'data.frame':    2410 obs. of  10 variables:
##  $ Brew_ID: int  1 1 1 1 1 1 2 2 2 2 ...
##  $ Name.x : chr  "NorthGate Brewing " "NorthGate Brewing " "NorthGate Brewing " "NorthGate Brewing " ...
##  $ City   : chr  "Minneapolis" "Minneapolis" "Minneapolis" "Minneapolis" ...
##  $ State  : chr  " MN" " MN" " MN" " MN" ...
##  $ Name.y : chr  "Pumpion" "Stronghold" "Parapet ESB" "Get Together" ...
##  $ Beer_ID: int  2689 2688 2687 2692 2691 2690 2683 2686 2685 2684 ...
##  $ ABV    : num  0.06 0.06 0.056 0.045 0.049 0.048 0.042 0.08 0.125 0.077 ...
##  $ IBU    : int  38 25 47 50 26 19 42 68 80 25 ...
##  $ Style  : chr  "Pumpkin Ale" "American Porter" "Extra Special / Strong Bitter (ESB)" "American IPA" ...
##  $ Ounces : num  16 16 16 16 16 16 16 16 16 16 ...
##first 6 rows and last 6 rows columns header match means our data merged correctly
# Print the first 6 observations of the merged data
head(merged_data, n = 6)
##   Brew_ID             Name.x        City State        Name.y Beer_ID   ABV IBU
## 1       1 NorthGate Brewing  Minneapolis    MN       Pumpion    2689 0.060  38
## 2       1 NorthGate Brewing  Minneapolis    MN    Stronghold    2688 0.060  25
## 3       1 NorthGate Brewing  Minneapolis    MN   Parapet ESB    2687 0.056  47
## 4       1 NorthGate Brewing  Minneapolis    MN  Get Together    2692 0.045  50
## 5       1 NorthGate Brewing  Minneapolis    MN Maggie's Leap    2691 0.049  26
## 6       1 NorthGate Brewing  Minneapolis    MN    Wall's End    2690 0.048  19
##                                 Style Ounces
## 1                         Pumpkin Ale     16
## 2                     American Porter     16
## 3 Extra Special / Strong Bitter (ESB)     16
## 4                        American IPA     16
## 5                  Milk / Sweet Stout     16
## 6                   English Brown Ale     16
# Print the last 6 observations of the merged data
tail(merged_data, n = 6)
##      Brew_ID                        Name.x          City State
## 2405     556         Ukiah Brewing Company         Ukiah    CA
## 2406     557       Butternuts Beer and Ale Garrattsville    NY
## 2407     557       Butternuts Beer and Ale Garrattsville    NY
## 2408     557       Butternuts Beer and Ale Garrattsville    NY
## 2409     557       Butternuts Beer and Ale Garrattsville    NY
## 2410     558 Sleeping Lady Brewing Company     Anchorage    AK
##                         Name.y Beer_ID   ABV IBU                   Style Ounces
## 2405             Pilsner Ukiah      98 0.055  NA         German Pilsener     12
## 2406         Porkslap Pale Ale      49 0.043  NA American Pale Ale (APA)     12
## 2407           Snapperhead IPA      51 0.068  NA            American IPA     12
## 2408         Moo Thunder Stout      50 0.049  NA      Milk / Sweet Stout     12
## 2409  Heinnieweisse Weissebier      52 0.049  NA              Hefeweizen     12
## 2410 Urban Wilderness Pale Ale      30 0.049  NA        English Pale Ale     12

#3. Address the missing values in each column.
#Solution:
#We addressed the missing values in each column found in our data. Specifically, we found that there were some missing values in the ABV and IBU columns for quite a few beers across different breweries. So, we wanted to see the missing ABV and IBU values to get an idea as to how those data points are spread across the beers overall. #We can see the missing ABV values spread out over the many present ones in this dataset. This gives us confidence that suggests that the data is missing completely at random, so we do not need to include those beers with missing values in our analysis. #We plotted the missing IBU values alongside the present IBU values and the scatter plot presents the missing values that are spread out randomly across the dataset. This is good news, because if the missing values were systematic, it could indicate errors in the data collection process. This random distribution suggests that the missing values is due to chance and not related to any specific characteristics of the different beers in style, state location, or the dataset. Overall, this insight provides us with confidence that the records aren’t missing due to any specific bias or systematic errors.

#Delete all if missing completely at random
#First we need to check for missing values
dim(merged_data)
## [1] 2410   10
sum(is.na(merged_data))
## [1] 1067
str(merged_data)
## 'data.frame':    2410 obs. of  10 variables:
##  $ Brew_ID: int  1 1 1 1 1 1 2 2 2 2 ...
##  $ Name.x : chr  "NorthGate Brewing " "NorthGate Brewing " "NorthGate Brewing " "NorthGate Brewing " ...
##  $ City   : chr  "Minneapolis" "Minneapolis" "Minneapolis" "Minneapolis" ...
##  $ State  : chr  " MN" " MN" " MN" " MN" ...
##  $ Name.y : chr  "Pumpion" "Stronghold" "Parapet ESB" "Get Together" ...
##  $ Beer_ID: int  2689 2688 2687 2692 2691 2690 2683 2686 2685 2684 ...
##  $ ABV    : num  0.06 0.06 0.056 0.045 0.049 0.048 0.042 0.08 0.125 0.077 ...
##  $ IBU    : int  38 25 47 50 26 19 42 68 80 25 ...
##  $ Style  : chr  "Pumpkin Ale" "American Porter" "Extra Special / Strong Bitter (ESB)" "American IPA" ...
##  $ Ounces : num  16 16 16 16 16 16 16 16 16 16 ...
sum(is.na(merged_data$Brew_ID))
## [1] 0
sum(is.na(merged_data$Name.x))
## [1] 0
sum(is.na(merged_data$City))
## [1] 0
sum(is.na(merged_data$State))
## [1] 0
sum(is.na(merged_data$Name.y))
## [1] 0
sum(is.na(merged_data$Beer_ID))
## [1] 0
#62 missing values in the ABV column
sum(is.na(merged_data$ABV))
## [1] 62
#998 missing values in the IBU column
sum(is.na(merged_data$IBU))
## [1] 1005
sum(is.na(merged_data$Style))
## [1] 0
sum(is.na(merged_data$Ounces))
## [1] 0
missing <- is.na(merged_data)
#missing
#Example of MCAR idea: The probability of missing does not depend on the data, the covariates: State, Style or Beer itself.  It is as if the wind blew away records at random and ABU and IBU were all equally likely to by blown away (missing).  List wise Deletion OK and Imputation OK!
#now we need to determine if the values are missing completely at random, so lets check this by plotting the data visually with a scatter plot
dim(merged_data)
## [1] 2410   10
#merged_data
#ABV missing values
#is.na will return TRUE if there is an NA value or FALSE if no NA value 
#plug in data,data1, and data2
#We can observe that if we order the data by state or by style we can observe that the data is still spread out through the data providing more confidence that the data for ABV are completely missing at random
data <- data.frame(x = is.na(merged_data$ABV)) 
data1 <- data.frame(x = is.na(merged_data$Style)) 
data2 <- data.frame(x = is.na(merged_data$State)) 
#Scatter Plot of Missing ABV Values  
#We can observe from this scatter plot that the ABV values that are missing for the beers are spread out over the many present ones in this data set. This suggests that the data is missing completely at random so we do not need to include those beers.
gg <- ggplot(data, aes(x = 1:nrow(data), y = x)) +
  geom_point(size = 1, height = .5, width = .5, color = "red") +
  xlab("Beers by row number") +
 ylab("ABV Values Present or Missing") +
 labs(title = "Scatter Plot of Missing ABU Values", size = 10) +
 scale_y_discrete(labels = c("Present", "Missing")) +
 scale_x_continuous(breaks = seq(0, nrow(data), by = 100)) +
 theme(axis.text.x = element_text(angle = 90, hjust  = 1)) + 
 theme_classic()
## Warning in geom_point(size = 1, height = 0.5, width = 0.5, color = "red"):
## Ignoring unknown parameters: `height` and `width`
ggplotly(gg)
#From the plots we can observe that the missing values for ABV are spread out overall the data.
#This suggest that the data missing completely at random so we can delete all the NA values.
#The missing values appear to be randomly distributed throughout the data set, it may suggest that the missingness is due to chance.
#The probability of missing does not depend on the data. 
#It is as if the wind blew away records at random and ABV, State,  were all equally likely to by blown away (missing). 
#From the plot we can observe that the missing values for ABV are spread out overall the data, 
#this suggest that the data missing completely at random so we can delete all the NA values.
#the missing values appear to be randomly distributed throughout the dataset, it may suggest that the missingness is due to chance.
#The probability of missing does not depend on the data. 
#It is as if the wind blew away records at random and ABV, State,  were all equally likely to by blown away (missing). 

#IBU
#Scatter Plot of Missing IBU Values
#Let's take a closer look at the missing IBU values in our dataset. We plotted the beers with missing IBU values against the present IBU values, and the resulting scatter plot shows us that the missing values are spread out randomly across the dataset. This is good news, because if the missingness was systematic, it could indicate errors in the data collection process.

#This random distribution suggests that the missingness is due to chance, and is not related to any specific characteristics of the beers or the dataset. In other words, it's as if the wind blew away records at random for ABV and IBU values for those beers, making the missingness completely random. This is what we call missing completely at random (MCAR).
#Overall, this insight helps us to better understand the nature of the missing IBU values and provides us with more confidence that the records aren’t missing due to any specific bias or systematic error in our dataset.
#plug in data3,data4, and data5
#We can observe that if we order the data by state or by style we can observe that the data is still spread out through the data providing more confidence that the data for IBU are completely missing at random.
data3 <- data.frame(x = is.na(merged_data$IBU)) 
data4 <- data.frame(x = is.na(merged_data$Style)) 
data5 <- data.frame(x = is.na(merged_data$State)) 

gg <- ggplot(data3, aes(x = 1:nrow(data), y = x)) +
  geom_point(size = 1, height = .5, width = .5, color = "red") +
  xlab("Beers by row number") +
  ylab("IBU Values Present or Missing") +
  ggtitle("Scatter Plot of Missing IBU values") +
  scale_y_discrete(labels = c("Present", "Missing")) +
  scale_x_continuous(breaks = seq(0, nrow(data), by = 100)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  theme_classic()
## Warning in geom_point(size = 1, height = 0.5, width = 0.5, color = "red"):
## Ignoring unknown parameters: `height` and `width`
ggplotly(gg)
#From the plot we can observe that the missing values for IBU are spread out overall the data, 
#this suggest that the data missing completely at random so we can delete all the NA values. 
#Given that the ABV and IBU are suggested to be completely missing at random since the scatter plots shows wide spread
#We can remove these missing values.
#We will assume that the abv and IBU are missing at random.
dim(merged_data)
## [1] 2410   10
#merged_data

#4. Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare.
#Solution:
#Moving on, we computed the median Alcohol By Volume or ABV. It is a standard measure used to indicate the alcoholic percentage in beverages. And the International Bitterness Units (IBU) measures the bitterness of the beer, which is primarily determined by the number of hops used during the brewing process. The IBU’s are used to provide a standardized scale for measuring bitterness from 0 (no bitterness) to over a 100 (extremely bitter). We showed this with two barplots for Median ABV and Median IBU respectively.

#ABV & IBU by State 
#Moving on, we computed the median alcohol ABV which stands for Alcohol By Volume. It is a standard measure used to indicate the percentage of alcohol in alcoholic beverages and the (IBU) which stands for International Bitterness Units. It's worth a reminder that it is a measure of the bitterness of beer, which is primarily determined by the amount of hops used during the brewing process. IBUs are used to provide a standardized scale for measuring the bitterness of different beers, and can range from 0 (no bitterness) to well over 100 (extremely bitter). We plotted the Median ABV and IBU for each state and plotted a bar chart to compare them.

#Group the data by State and compute median ABV and IBU for each group
#In the code above, we first group the data by State using the group_by function and then use the summarize function to compute the median ABV and IBU for each group. We then use ggplot2 to plot a bar chart comparing the median ABV and IBU for each state. The geom_bar function is used to create the bars for each variable (ABV and IBU), and the fill parameter is used to set the color of the bars. The labs function is used to set the title and axis labels,Finally, we use the scale_y_continuous function to format the y-axis labels so they don't display in scientific notation.

#Plot the bar chart for ABV
#ABV by State
#From this ABV bar plot, we can see that the District of Columbia and Kentucky had the highest median ABV of 6.25%, while Utah had the lowest median ABV of 4%. 
#na.rm is an argument in many R functions that indicates whether to remove missing values or not. 
#When na.rm is set to TRUE, any missing values in the data are removed before computation.
#For example, in the mean() function, if na.rm is set to TRUE, any missing values in the data will be ignored when calculating the mean.
# Plot the bar chart
#observe that SD is missing median, since it has so few beweries, it just happen randomly that were missing data for it for IBU. I Imputed the data by looking them up online for the missing IBU values so that SD bar cn be shown.
#removed na values for all observation in merged_data data frame.
#remove na values from merged_data
merged_data <- na.omit(merged_data)
grouped_data <- merged_data %>% 
  group_by(State) %>% 
  summarize(median_ABV = median(ABV, na.rm = TRUE))
gg <- ggplot(grouped_data, aes(x = reorder(State, median_ABV), y = median_ABV)) +
  geom_bar(fill = "steelblue2", stat = "identity") +
  labs(title = "Median ABV by State",
       x = "State",
       y = "Median ABV") +
  theme_classic() +
  #theme(axis.text.x = element_text(size = 9, angle = 45, vjust = 1, hjust = 0.5)) 
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 0.5, size = 11), axis.text.y=element_text(size=13), text=element_text(size=20))
ggplotly(gg)
#Plot the bar chart for IBU
#IBU by State
#Now for the IBU bar plot, we can see in terms of IBU, Maine had the highest median IBU of 61, while Wisconsin had the lowest median IBU of 19. 
grouped_data <- merged_data %>% 
  group_by(State) %>% 
  summarize(median_IBU = median(IBU, na.rm = TRUE))
gg <- ggplot(grouped_data, aes(x = reorder(State, median_IBU), y = median_IBU)) +
  geom_bar(fill = "darkolivegreen3", stat = "identity") +
  labs(title = "Median IBU by State",
       x = "State",
       y = "Median IBU") +
  theme_classic() +
  #theme(axis.text.x = element_text(size = 9, angle = 45, vjust = 1, hjust = 0.5)) 
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 0.5, size = 11), axis.text.y=element_text(size=13), text=element_text(size=20))
ggplotly(gg) 

#5. Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer?
#Solution:
# We can observe from the bar plots that Colorado has the highest ABV beer and Oregon has the highest IBU.

#we can observe that Colorado as a state has the highest ABV and Oregon has the highest IBU in the US
#sanity check see output of max for yourself for ABV and IBU to compare against Bar Chart
#State with max ABV

#This code uses the sprintf() function with the format string "%3f", which specifies that the output should have three decimal places. The max() function is used to get the maximum ABV value from the max_abv dataframe, and the na.rm = TRUE argument is included to remove any missing values (represented by NA) from the calculation.
#The cat() function is then used to display the maximum ABV value with a descriptive message. You can modify the message as desired.
#found that colorado has the beer with Highest ABV so going to impute any missing values for this beer so its included into our analysis later
max_abv_value <- sprintf("%.3f", max(merged_data$ABV, na.rm = TRUE))
cat("Maximum ABV value:", max_abv_value)
## Maximum ABV value: 0.125
max_ABV_observation <- which.max(merged_data$ABV)
state_with_max_ABV <- merged_data$State[max_ABV_observation]
#State with max ABV
state_with_max_ABV
## [1] " KY"
max_IBU_observation <- which.max(merged_data$IBU)
state_with_max_IBU <- merged_data$State[max_IBU_observation]
#State with max IBU
state_with_max_IBU
## [1] " OR"
#Group by state and calculate maximum ABV
grouped_data <- merged_data %>% 
  group_by(State) %>% 
  summarize(max_ABV = max(ABV, na.rm = TRUE)) %>%
  filter(!is.na(max_ABV)) %>% # Remove any rows with missing max values
  arrange(desc(max_ABV)) %>% # Sort by max ABV in descending order
  head(5) # Select top 5 states by max ABV

#Plot the bar chart of top 5 states in ascending order from left to right
#We can see the state of Colorado takes the number one spot, with an ABV of 12.8%. 
gg <- ggplot(grouped_data, aes(x = reorder(State, max_ABV), y = max_ABV, fill = State)) +
  geom_bar(stat = "identity") +
  labs(title = "Top 5 States with Most Alcoholic (ABV) Beer",
       x = "State",
       y = "Max ABV") +
  theme_classic() +
  theme(axis.text.x = element_text(size = 13, angle = 90, vjust = 1, hjust = 0.5), axis.text.y=element_text(size = 13), text=element_text(size = 20)) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  scale_fill_discrete(name = "State")
ggplotly(gg)
#Group by state and calculate maximum IBU
grouped_data <- merged_data %>% 
  group_by(State) %>% 
  summarize(max_IBU = max(IBU, na.rm = TRUE)) %>%
  filter(!is.na(max_IBU)) %>% # Remove any rows with missing max values
  arrange(desc(max_IBU)) %>% # Sort by max ABV in descending order
  head(5) # Select top 5 states by max ABV

#Plot the bar chart of top 5 states in ascending order from left to right
#On the other hand, from this bar plot, for the top 5 most bitter (IBU) beers we can see Oregon to be the winner this time, with a whopping IBU score of 138.
gg<- ggplot(grouped_data, aes(x = reorder(State, max_IBU), y = max_IBU, fill = State)) +
  geom_bar(stat = "identity") +
  labs(title = "Top 5 States with Most Bitter (IBU) Beer",
       x = "State",
       y = "Max IBU") +
  theme_classic() +
  theme(axis.text.x = element_text(size = 13,angle = 90, vjust = 1, hjust = 0.5), axis.text.y=element_text(size=13), text=element_text(size=20)) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  scale_fill_discrete(name = "State")
ggplotly(gg)

#6. Comment on the summary statistics and distribution of the ABV variable.
#Solution:
#Here we can see a table of the summary statistics.
#Observe that we have the minimum ABV level of 0% for non-alcoholic beers, a maximum ABV level of 12.8%, a Median ABV level of 5.6% and a Mean ABV level of 5.97%. #Here we see a histogram of ABV values. Observe that there is a wide range of values for the ABV variable, with many of the value in the range of 4-7%. The distribution appears to be unimodal. Also notice the histogram is roughly symmetric except for the long right tail, which indicates that there are some beers with high ABV values that are considered outliers in this case a Colorado beer.
#In general, the ABV variable shows a reasonable range and distribution of values for beer, with most of the beers having a relatively moderate alcohol content.

#From the summary statistics and the distribution plot, we can see that the ABV variable has a minimum value of 0%, a maximum value of 12.7%, and a median value of 5.6%. The mean value is slightly higher at 5.97%, which suggests that the distribution is positively skewed, as can be seen from the longer tail on the right side of the histogram.

# Calculate summary statistics for ABV
abv_summary <- summary(merged_data$ABV)
abv_summary
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02700 0.05000 0.05700 0.05991 0.06800 0.12500
#Standard Deviation of ABV for merged_data data set
sd(merged_data$ABV)
## [1] 0.01357633
#Plot the histogram of ABV
#Here we can see the distribution of the ABV values expressed as a histogram. Observe that there is a wide range of values for the ABV variable, with the majority of the values concentrated in the range of 4-7%. The distribution appears to be unimodal. Also notice the histogram is roughly symmetric except for the long right tail, which indicates that there are some beers with high ABV values that are considered outliers.
#In general, the ABV variable shows a reasonable range and distribution of values for beer, with the majority of beers having a relatively moderate alcohol content. However, there are some extreme values that might require further investigation, as they could be due to errors in data entry or some other factors.But not the case I found a one beer in Colorado that has an ABV of 12.8%, which is the outlier.

gg <- ggplot(merged_data, aes(x = ABV)) +
  geom_histogram(color = "white", fill = "red", bins = 30,
                 na.rm = TRUE) + 
  xlab("ABV") +
  ylab("Count") +
  labs(title = "Histogram of ABV") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5, size = 25),
        axis.text.x = element_text(size = 20),
        axis.text.y = element_text(size = 20),
        axis.title.x = element_text(size = 25),
        axis.title.y = element_text(size = 25))
ggplotly(gg)

#7. Is there an apparent relationship between the bitterness of the beer and its alcoholic content? Draw a scatter plot. Make your best judgment of a relationship and EXPLAIN your answer.
#Solution:
#Now we explored if there exist an apparent relationship between the bitterness of the beer and its alcoholic content. We can observe this relationship between the alcoholic content (ABV) and bitterness (IBU) of the beers.
#The gray dots represent individual beers, and the red line represents a smoothed curve fit of the data.
#Overall, there appears to be a slight positive relationship associated between ABV and IBU, but we can see a downward trend later since we account for an outlier that has a high ABV but lower IBU. As the alcoholic content of beer increases, so does its bitterness, as indicated by the general upward trend of the red curve. However, the relationship is not very strong, as there is quite a bit of variability in IBU values for any given ABV value. This suggests that while there is some relationship associated between ABV and IBU, other factors such as the type of beer, brewing methods, and ingredient choices are also important in determining the bitterness of a beer.

library(ggplot2)

merged_data <- merge(Breweries, Beers, by.x = "Brew_ID", by.y = "Brewery_id", all = TRUE)
#run na.omit to make the merged_data free of na values this is very important since KNN model will not run if it identifies missing values.
merged_data <- na.omit(merged_data)
#Scatter plot of ABV and IBU with Local polynomial regression
#This scatter plot shows the relationship between the alcoholic content (ABV) and bitterness (IBU) of beer. The gray dots represent individual beers, and the red line represents a smoothed curve fit to the data using the loess method (Local Polynomial regression).
#Overall, there appears to be a slight positive relationship associated between ABV and IBU. As the alcoholic content of beer increases, so does its bitterness, as indicated by the general upward trend of the red curve. However, the relationship is not very strong, as there is quite a bit of variability in IBU values for any given ABV value. This suggests that while there is some relationship between ABV and IBU, other factors such as the type of beer, brewing methods, and ingredient choices are also important in determining the bitterness of a beer.

gg <- ggplot(merged_data, aes(x = ABV, y = IBU)) +
  geom_point(color = "darkgray", position = "jitter") +
  geom_smooth(method = "loess", se = FALSE, color = "red") +
  labs(title = "Scatter Plot of ABV and IBU",
       x = "ABV",
       y = "IBU") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5, size = 12), axis.text.x=element_text(size=12), axis.text.y=element_text(size=12), axis.text=element_text(size=30))
ggplotly(gg)
## `geom_smooth()` using formula = 'y ~ x'

#8. Budweiser would also like to investigate the difference with respect to IBU and ABV between IPAs (India Pale Ales) and other types of Ale (any beer with “Ale” in its name other than IPA). You decide to use KNN classification to investigate this relationship. Provide statistical evidence one way or the other. In addition, while you have decided to use KNN to investigate this relationship (KNN is required) you may also feel free to supplement your response to this question with any other methods or techniques you have learned. Creativity and alternative solutions are always encouraged.
#Solution
#Furthermore, we wanted to identify whether there is a relationship between a beer’s ABV and IBU values and its classification as an IPA or Ale. To answer this question, we used a machine learning technique called K-Nearest Neighbors to classify beers as either an IPA or Ale based on their ABV and IBU values. To do this, we first ran this analysis multiple times with different training and testing data sets and different values of k, which is the number of nearest neighbors used in the classification. Here we happened to see our best k value is 5 with an accuracy rate around 86%.
#The confusion matrix here is a two by two table that summarizes the performance of a classification model by comparing its predictions to the actual labels in a test dataset. Here the rows correspond to the actual classifications and the columns correspond to the predicted classifications. Here we treat Ale as being the positive classification and IPA being the negative classification. The four possible outcomes are:
#True positive (TP): The model correctly predicts Ale in this case when the actual is Ale. So, we have 133 beers correctly classified as Ale
#False positive (FP): The model predicts Ale when the actual is IPA. So, we have 31 beers incorrectly classified as Ale’s when they were really IPA’s (type 1 error)
#False negative (FN): The model predicts IPA when the actual is ALE. So, we have 13 beers that are actually ALE’S but were predicted to be IPAs. (type 2 error)
#And
#True negative (TN): The model correctly predicts IPA in this case when the actual is IPA. So, we have 108 beers that are correctly classified as not being ALE’s.
#We can observe that the statistic performance results of this KNN model accurately classifies a beer as an ALE or IPA with an 84 percent accuracy rate, which is pretty good. So, we are confident that this Machine learning KNN model is correctly classifying the beers based of the associated relationship of ABV and IBU.
#We can visualize the decision boundary for the KNN model, which is a straight line that separates the different types of beers based on their ABV and IBU values. The KNN model uses this boundary to classify new beers as either Ales or IPAs. In general, we can see that IPAs tend to have higher IBU and ABV ranges, but there seems to be also some overlap between the two groups. The decision boundary is set in such a way as to maximize the accuracy of the KNN model in classifying the beers

# First we wanted to show different categories of beers of what we are working with.
# add lager, stout, pilsner, zymurgy, shandy, porter to the broader style list
# notice merged_Data will have a column called IPAALE to categorize each beer as an IPA, ALE, LAGER, STOUT, PILSNER, ZYMURGY, SHANDY, PORTER, OR NEITHER and get these key words if they are found in the style column of the style name using the grepl function.
merged_data$IPAAle = case_when(grepl("\\bIPA\\b", merged_data$Style, ignore.case = TRUE) ~ "IPA",
                             grepl("\\bindia pale ale\\b", merged_data$Style, ignore.case = TRUE) ~ "IPA",
                             grepl("\\bale\\b", merged_data$Style, ignore.case = TRUE ) ~ "Ale",
                             grepl("\\blager\\b", merged_data$Style, ignore.case = TRUE) ~ "Lager",
                             grepl("\\bstout\\b", merged_data$Style, ignore.case = TRUE) ~ "Stout",
                             grepl("\\bpilsner\\b", merged_data$Style, ignore.case = TRUE) ~ "Pilsner",
                             grepl("\\bzymurgy\\b", merged_data$Style, ignore.case = TRUE) ~ "Zymurgy",
                             grepl("\\bshandy\\b", merged_data$Style, ignore.case = TRUE) ~ "Shandy",
                             grepl("\\bporter\\b", merged_data$Style, ignore.case = TRUE) ~ "Porter",
                             TRUE ~ "Neither")
#All the categories of beers listed are merged here into merged_data
merged_data$IPAAle <- as.factor(merged_data$IPAAle)
#check what columns are in the merged data so far
head(merged_data)
##   Brew_ID             Name.x        City State        Name.y Beer_ID   ABV IBU
## 1       1 NorthGate Brewing  Minneapolis    MN       Pumpion    2689 0.060  38
## 2       1 NorthGate Brewing  Minneapolis    MN    Stronghold    2688 0.060  25
## 3       1 NorthGate Brewing  Minneapolis    MN   Parapet ESB    2687 0.056  47
## 4       1 NorthGate Brewing  Minneapolis    MN  Get Together    2692 0.045  50
## 5       1 NorthGate Brewing  Minneapolis    MN Maggie's Leap    2691 0.049  26
## 6       1 NorthGate Brewing  Minneapolis    MN    Wall's End    2690 0.048  19
##                                 Style Ounces  IPAAle
## 1                         Pumpkin Ale     16     Ale
## 2                     American Porter     16  Porter
## 3 Extra Special / Strong Bitter (ESB)     16 Neither
## 4                        American IPA     16     IPA
## 5                  Milk / Sweet Stout     16   Stout
## 6                   English Brown Ale     16     Ale
#get names of the columns of the merged_data
names(merged_data)
##  [1] "Brew_ID" "Name.x"  "City"    "State"   "Name.y"  "Beer_ID" "ABV"    
##  [8] "IBU"     "Style"   "Ounces"  "IPAAle"
#Create new Data Frame so not to confuse
#Filter Ales and IPAs only
buzzKNN <- dplyr::filter(merged_data, IPAAle == "IPA" | IPAAle == "Ale")
iterations = 45
numks = 25
#70-30 split
splitPerc = .70
set.seed(123)
masterAcc = matrix(nrow = iterations, ncol = numks)
#Now we wanted to identify whether there is a relationship between a beer's ABV and IBU values and its classification as an IPA or Ale. To answer this question, we used a machine learning technique called K-Nearest Neighbors (KNN) to classify beers as either an IPA or Ale based on their ABV and IBU values. To do this, we first ran this analysis multiple times with different training and testing data sets and different values of k, which is the number of nearest neighbors used in the classification. Here we happened to see our value for k is 5 with an accuracy rate around 86%.
for(j in 1:iterations)
{
  trainIndices = sample(1:dim(buzzKNN)[1],round(splitPerc * dim(buzzKNN)[1]))
  train = buzzKNN[trainIndices,]
  test = buzzKNN[-trainIndices,]
  for(i in 1:numks)
  {
    classifications = knn(train[,c(7,8)],test[,c(7,8)],train$IPAAle, prob = TRUE, k = i)
    table(classifications,test$IPAAle)
    CM = confusionMatrix(table(classifications,test$IPAAle))
    masterAcc[j,i] = CM$overall[1]
  }
}
MeanAcc = colMeans(masterAcc)
MeanAcc
##  [1] 0.8412250 0.8333726 0.8483706 0.8525324 0.8606203 0.8563015 0.8522183
##  [8] 0.8478995 0.8437377 0.8428740 0.8428740 0.8432666 0.8413035 0.8422458
## [15] 0.8411464 0.8418532 0.8393404 0.8380840 0.8368276 0.8370632 0.8376914
## [22] 0.8372988 0.8372988 0.8380840 0.8383196
#Visually find the best value of k by using it's location in the data frame based on the highest Mean value
plot(seq(1,numks,1),MeanAcc, type = "l", 
     col = "#c8201e",
     main = "Value for K Neighbors vs Accuracy", 
     sub = "Budweiser Consultation",
     xlab = "Value of K Neighbors",
     ylab = "Accuracy Rate (Percentage)")

# Locate the value of k based on the best MeanAcc in the dataframe
#match function do Research?
kvalue = match(max(MeanAcc), MeanAcc)
max(MeanAcc)
## [1] 0.8606203
kvalue
## [1] 5
####### Best value of k = 5 around 0.8646394 Mean Accuracy #####################
####### Train the model using kvalue #####################
#Confusion Matrix and Statistics( read Title)
#The confusion matrix here is a two by two table that summarizes the performance of a classification model by comparing its predictions to the actual labels in a test dataset. Here the rows correspond to the actual classifications and the columns correspond to the predicted classifications. The four possible outcomes are:
#True positive (TP): The model correctly predicts a positive classification Ale in this case when the actual classification is positive or is Ale. So we have 133 beers correctly classified as Ale
#False positive (FP): The model predicts a positive classification ale when the actual classification is negative not an Ale but an IPA. So we have 31 beers incorrectly classified as Ale’s when they were really IPA’s (type 1 error)
#False negative (FN): The model predicts a negative classification IPA in this case when the actual classification is positive or an ALE. So we have 13 beers that are actually  ALE’S but were predicted to be IPAs. (type 2 error)
#And finally we have
#True negative (TN): The model correctly predicts a negative classification IPA in this case when the actual classification is negative or an IPA. So we have 108 beers that are correctly classified as not being ALE’s.
#We can observe that the statistic performance results of this KNN model accurately classifies a beer as an ALE or IPA with an 84 percent accuracy rate, which is pretty good. So we can have confidence this Machine learning KNN model is correctly classifying the beers based of the associated relationship of ABV and IBU. 
classifications = knn(train[,c(7,8)],test[,c(7,8)],train$IPAAle, prob = TRUE, k = kvalue)
table(classifications,test$IPAAle)
##                
## classifications Ale IPA Lager Neither Pilsner Porter Stout
##         Ale     150  13     0       0       0      0     0
##         IPA      23  97     0       0       0      0     0
##         Lager     0   0     0       0       0      0     0
##         Neither   0   0     0       0       0      0     0
##         Pilsner   0   0     0       0       0      0     0
##         Porter    0   0     0       0       0      0     0
##         Stout     0   0     0       0       0      0     0
CM = confusionMatrix(table(classifications,test$IPAAle))
### Get details of test using CM ####
#We can observe here our model has an Accuracy of around 84%, which is really good.
CM
## Confusion Matrix and Statistics
## 
##                
## classifications Ale IPA Lager Neither Pilsner Porter Stout
##         Ale     150  13     0       0       0      0     0
##         IPA      23  97     0       0       0      0     0
##         Lager     0   0     0       0       0      0     0
##         Neither   0   0     0       0       0      0     0
##         Pilsner   0   0     0       0       0      0     0
##         Porter    0   0     0       0       0      0     0
##         Stout     0   0     0       0       0      0     0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8728          
##                  95% CI : (0.8283, 0.9093)
##     No Information Rate : 0.6113          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7367          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Ale Class: IPA Class: Lager Class: Neither
## Sensitivity              0.8671     0.8818           NA             NA
## Specificity              0.8818     0.8671            1              1
## Pos Pred Value           0.9202     0.8083           NA             NA
## Neg Pred Value           0.8083     0.9202           NA             NA
## Prevalence               0.6113     0.3887            0              0
## Detection Rate           0.5300     0.3428            0              0
## Detection Prevalence     0.5760     0.4240            0              0
## Balanced Accuracy        0.8744     0.8744           NA             NA
##                      Class: Pilsner Class: Porter Class: Stout
## Sensitivity                      NA            NA           NA
## Specificity                       1             1            1
## Pos Pred Value                   NA            NA           NA
## Neg Pred Value                   NA            NA           NA
## Prevalence                        0             0            0
## Detection Rate                    0             0            0
## Detection Prevalence              0             0            0
## Balanced Accuracy                NA            NA           NA
dim(merged_data)
## [1] 1405   11
## [1] IPA IPA Ale Ale IPA IPA IPA
## attr(,"prob")
## [1] 0.8000000 0.8333333 0.5000000 1.0000000 1.0000000 0.8000000 1.0000000
## Levels: Ale IPA Lager Neither Pilsner Porter Stout
#We can observe the probability for this random set is 0.8000000 0.8333333 0.5000000 1.0000000 1.0000000 0.8000000 1.0000000 data pulled for ABV and IBU columns are very similar to the accuracy of our knn model model. So this gives us higher confidence that our knn model is classifying the beers correctly as  ALE or India Pale Ale.
classifyMyBeers <- data.frame(ABV = c(6,6,5,4,5, 12, 7), 
                              IBU = c(78, 65, 55, 38, 100, 148, 98))
classifications = knn(train[,c(7,8)],classifyMyBeers,train$IPAAle, prob = TRUE, k = kvalue)
classifications
## [1] IPA IPA Ale Ale IPA IPA IPA
## attr(,"prob")
## [1] 0.8000000 0.8000000 0.6000000 0.8000000 0.8333333 1.0000000 0.8000000
## Levels: Ale IPA Lager Neither Pilsner Porter Stout
############# Summary data by classification #############
IPAAleSummary <- buzzKNN %>% 
  group_by(IPAAle) %>% 
  dplyr::summarize(ABV.min = min(ABV), 
                   ABV.med = median(ABV),
                   ABV.max = max(ABV), 
                   IBU.min = min(IBU), 
                   IBU.med = median(IBU),
                   IBU.max = max(IBU))
view(IPAAleSummary)
IPAAleSummary
## # A tibble: 2 × 7
##   IPAAle ABV.min ABV.med ABV.max IBU.min IBU.med IBU.max
##   <fct>    <dbl>   <dbl>   <dbl>   <int>   <dbl>   <int>
## 1 Ale      0.035   0.054   0.099       4      30     115
## 2 IPA      0.038   0.068   0.099      30      70     138
########################################################
##### Replot and color by beer style ###################
#this plot shows the relationship of ABV and IBU for IPA and ALEs
#Linear Correlation Model for KNN
#We can see the decision boundary for the KNN model, which is a straight line that separates the different types of beers based on their ABV and IBU values. The KNN model uses this boundary to classify new beers as either Ales or IPAs. In general, we can see that IPAs tend to have higher IBU and ABV ranges, but there seems to be also some overlap between the two groups. The decision boundary is set in such a way as to maximize the accuracy of the KNN model in classifying new beers.
comparisonCoef <- coef(lm(ABV ~ IBU, buzzKNN))
comparisonCoef
##  (Intercept)          IBU 
## 0.0448459578 0.0003390537
buzzKNN %>% 
ggplot(aes(x = IBU, y = ABV, color = IPAAle)) +
  geom_point(show.legend = TRUE, na.rm = TRUE, position = "jitter") +
  geom_abline(intercept =  comparisonCoef[1] , slope = comparisonCoef[2], color = "#c8102E", size = 1) +
  theme_classic() + 
  labs(title = "IBU vs ABV", 
       subtitle = "MSDS Group Consultation",
       y = "Alcohol By Volume", 
       x = "International Bitterness Unit",
       caption="ABV and IBU values imputed where necessary.") +
  scale_color_manual(values = c("#c8102e","#13294b","#b1b3b3"),
                     name = "Type of Beer",
                     breaks = c("Ale", "IPA", "Neither"),
                     labels = c("Ale", "IPA", "Neither"))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.

#Alternative technique for classification
#We implemented a Naive Bayes classifier based on IBU and ABV as a predictor of categorization of style of beer (IPA,Ale, or Neither)
#Create new data frame "bayesDat for Naive Bayes classifier (Don't want to interfere with original merge_data Data Frame)
library(dplyr)
#Use dim function to see the number of observations we are working with.
dim(merged_data)
## [1] 1405   11
bayesDat <- dplyr::filter(merged_data, IPAAle == "IPA" | IPAAle == "Ale")
dim(bayesDat)
## [1] 944  11
#Make the classifier run properly by converting outcome to a factor
bayesDat$IPAAle <- as.factor(bayesDat$IPAAle)
#Run this loop to run classifier 100 times to determine mean accuracy
iterations = 100
masterAcc = matrix(nrow = iterations,ncol=3)
#Begin the loop
for(j in 1:iterations)
{
  #change seed each iteration
  set.seed(j)
  #Determine training and testing indicies  
  #70-30 split here 
  trainIndices = sample(seq(1:length(bayesDat$IPAAle)),round(.7*length(bayesDat$IPAAle)))
  trainBeer = bayesDat[trainIndices,]
  testBeer = bayesDat[-trainIndices,]
  #Generate model, table, and confusion matrix
  model = naiveBayes(trainBeer[,c(7,8)],trainBeer$IPAAle)
  table(predict(model,testBeer[,c(7,8)]),testBeer$IPAAle)
  CM = confusionMatrix(table(predict(model,testBeer[,c(7,8)]),testBeer$IPAAle))
  #Insert current accuracies
  masterAcc[j,1] = CM$overall[1]
  masterAcc[j,2] = CM$byClass[1]
  masterAcc[j,3] = CM$byClass[2]
}
#Mean accuracy
MeanAcc = colMeans(masterAcc)
MeanAcc
## [1] 0.8472792 0.8713822 0.8125263
#Confusion matrix
CM
## Confusion Matrix and Statistics
## 
##          
##           Ale IPA Lager Neither Pilsner Porter Stout
##   Ale     132  28     0       0       0      0     0
##   IPA      29  94     0       0       0      0     0
##   Lager     0   0     0       0       0      0     0
##   Neither   0   0     0       0       0      0     0
##   Pilsner   0   0     0       0       0      0     0
##   Porter    0   0     0       0       0      0     0
##   Stout     0   0     0       0       0      0     0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7986          
##                  95% CI : (0.7471, 0.8437)
##     No Information Rate : 0.5689          
##     P-Value [Acc > NIR] : 3.13e-16        
##                                           
##                   Kappa : 0.5898          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Ale Class: IPA Class: Lager Class: Neither
## Sensitivity              0.8199     0.7705           NA             NA
## Specificity              0.7705     0.8199            1              1
## Pos Pred Value           0.8250     0.7642           NA             NA
## Neg Pred Value           0.7642     0.8250           NA             NA
## Prevalence               0.5689     0.4311            0              0
## Detection Rate           0.4664     0.3322            0              0
## Detection Prevalence     0.5654     0.4346            0              0
## Balanced Accuracy        0.7952     0.7952           NA             NA
##                      Class: Pilsner Class: Porter Class: Stout
## Sensitivity                      NA            NA           NA
## Specificity                       1             1            1
## Pos Pred Value                   NA            NA           NA
## Neg Pred Value                   NA            NA           NA
## Prevalence                        0             0            0
## Detection Rate                    0             0            0
## Detection Prevalence              0             0            0
## Balanced Accuracy                NA            NA           NA
#see the number of observations allocated to train and test data frames 
dim(trainBeer)
## [1] 661  11
dim(testBeer)
## [1] 283  11

#9. Knock their socks off! Find one other useful inference from the data that you feel Budweiser may be able to find value in. You must convince them why it is important and back up your conviction with appropriate statistical evidence.
#We can infer from this heat map that Budweiser could adjust their beer offerings to include more IPAs, particularly those with high IBU values, to meet customer demand and capitalize on this popular beer style.

merged_data <- merge(Breweries, Beers, by.x = "Brew_ID", by.y = "Brewery_id", all = TRUE)
merged_data <- na.omit(merged_data)
library(tm) #text mining library provides the stopwords() function
library(tidyr)
library(plyr)
library(jsonlite)
library(dplyr)
library(tidyverse)
library(ggplot2)
library(stringr)
library(plotly)
library(ggthemes)
library(caret)
library(class)
library(e1071)
library(usmap)
merged_data$IPAAles = case_when(grepl("\\bIPA\\b", merged_data$Style, ignore.case = TRUE) ~ "IPA",
grepl("\\bindia pale ale\\b", merged_data$Style, ignore.case = TRUE) ~ "IPA",
grepl("\\bale\\b", merged_data$Style, ignore.case = TRUE ) ~ "Ale",
grepl("\\blager\\b", merged_data$Style, ignore.case = TRUE) ~ "Lager",
grepl("\\bstout\\b", merged_data$Style, ignore.case = TRUE) ~ "Stout",
grepl("\\bpilsner\\b", merged_data$Style, ignore.case = TRUE) ~ "Pilsner",
grepl("\\bzymurgy\\b", merged_data$Style, ignore.case = TRUE) ~ "Zymurgy",
grepl("\\bshandy\\b", merged_data$Style, ignore.case = TRUE) ~ "Shandy",
grepl("\\bporter\\b", merged_data$Style, ignore.case = TRUE) ~ "Porter",
TRUE ~ "Neither")
merged_data$IPAAles <- as.factor(merged_data$IPAAles)

# plot style by ABV
beermeABV <- merged_data %>%
mutate(ABVControlFact = cut(ABV, breaks = c(0,5,6.7,12.8),
labels = c("Low ABV","Average ABV","High ABV"))) %>%
count(IPAAles, ABVControlFact) %>%
ggplot(aes(x=reorder(IPAAles, -n), ABVControlFact)) +
geom_tile(mapping = aes(fill = n)) +
scale_fill_gradient(low = "#fff5f0", high = "#c8102e") +
labs(title = "ABV Assessment of Popular Beer Styles",
subtitle = "MSDS Consulting",
y = NULL,
x = "Styles of Beer",
caption="ABV values imputed where necessary.") +
theme_classic()

#We can infer from this heat map that Budweiser could adjust their beer offerings to include more IPAs, particularly those with high IBU values, to meet customer demand and capitalize on this popular beer style.
# plot style by IBU
beermeIBU <- merged_data %>%
mutate(IBUControlFact = cut(IBU, breaks = c(-10,21,57, 138),
labels = c("Low IBU","Average IBU","High IBU"))) %>%
count(IPAAles, IBUControlFact) %>%
ggplot(aes(x=reorder(IPAAles, -n), IBUControlFact)) +
geom_tile(mapping = aes(fill = n)) +
scale_fill_gradient(low = "#AFDBF5", high = "#012169") +
labs(title = "IBU Assessment of Popular Beer Styles",
subtitle = "MSDS Consulting Group",
y = NULL,
x = "Styles of Beer",
caption="IBU values imputed where necessary.") +
theme_classic()
ggplotly(beermeIBU)